import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# Raise pandas' display limits to 100 columns and 100 rows for frame output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# Load the calendar data and inspect its column types and missing values.
calendar = pd.read_csv('../data/calendar.csv')
calendar.dtypes
calendar.isnull().sum()
# Parse the date column into datetimes.
calendar['date'] = pd.to_datetime(calendar['date'])
# Remove "$" and "," from the price strings before converting to float.
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the symbols in place and make
# `astype(float)` fail. A raw string avoids invalid escape sequences.
calendar['price'] = calendar['price'].str.replace(r'[$,]', '', regex=True).astype(float)
calendar.head(1)
# Load the listings data and inspect the types of the columns of interest.
listings = pd.read_csv('../data/listings.csv')
listings[['id', 'last_scraped', 'name', 'description', 'neighborhood_overview', 'host_id', 'host_name', 'host_since',
          'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state',
          'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
          'bed_type', 'amenities', 'price', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating',
          'review_scores_accuracy', 'review_scores_location', 'reviews_per_month']].dtypes
# Remove "$" and "," from the price strings before converting to float.
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the symbols in place and make
# `astype(float)` fail. A raw string avoids invalid escape sequences.
listings['price'] = listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)
listings.shape
listings.sample(1)
# Load the reviews data and look at its column types and first row.
reviews = pd.read_csv('../data/reviews.csv')
reviews.dtypes
reviews.head(1)
# Number of distinct listing ids and distinct host ids.
len(listings['id'].unique())
len(listings['host_id'].unique())
# Missing-value count for each of the three neighbourhood columns.
neighbourhood_columns = ['neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed']
listings[neighbourhood_columns].isna().sum()
# Distinct listings with at least one non-available day, grouped by the month
# number of the calendar date.
unavailable_days = calendar[calendar.available == 'f']
non_available_rooms_by_month = (
    unavailable_days.groupby(calendar.date.dt.month)
    .listing_id.nunique()
    .reset_index()
    .rename(columns={'listing_id': 'Non-available rooms'})
)
non_available_rooms_by_month.head(1)
fig = px.line(
    non_available_rooms_by_month,
    x='date',
    y='Non-available rooms',
    title='Number of accommodation rented at least once by month',
)
fig.show()
# Number of non-available (listing, day) rows, grouped by the month number of
# the calendar date.
unavailable_days = calendar[calendar.available == 'f']
days_rented_by_month = (
    unavailable_days.groupby(calendar.date.dt.month)
    .listing_id.count()
    .reset_index()
    .rename(columns={'listing_id': 'Number of days rented'})
)
days_rented_by_month.head(1)
fig = px.line(
    days_rented_by_month,
    x='date',
    y='Number of days rented',
    title='Number of days rented by month',
)
fig.show()
# Create Figure
# Read the Mapbox token inside a context manager so the file handle is closed.
with open("../mapbox_token.txt") as token_file:
    px.set_mapbox_access_token(token_file.read())
# Plot a random sample of at most 500 listings, coloured by price. Capping the
# sample size at the frame length keeps `sample` from raising when there are
# fewer than 500 rows.
fig = px.scatter_mapbox(listings.sample(min(500, len(listings))),
                        lat="latitude", lon="longitude", color="price", hover_name='id',
                        zoom=10, width=550, height=650, title='Price by accommodation')
fig.show()
# Histogram of listing prices with 50 bins. The figure and axes are created
# explicitly with `plt.subplots` so the size and labels apply to a known Axes.
hist_fig, hist_ax = plt.subplots(figsize=(15, 5))
listings.price.plot.hist(bins=50, ax=hist_ax)
hist_ax.set_xlabel('Price')
hist_ax.set_ylabel('Frequency');
#plt.savefig('frequency_price.png')
# Elbow method: fit K-Means for k = 1..19 on the min-max scaled
# (latitude, longitude, price) features and record each model's inertia.
distortions = []
K = range(1, 20)
# The scaled features do not depend on k, so scale once outside the loop.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(listings[['latitude', 'longitude', 'price']])
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(scaled_features)
    distortions.append(kmeanModel.inertia_)
# Plot inertia against k.
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
#plt.savefig('elbow_method.png')
plt.show()
# Create K-Means
k = 6
kmeanModel = KMeans(n_clusters=k)
scaler = MinMaxScaler()
# Scale the features once and reuse them for fitting and label assignment.
scaled_features = scaler.fit_transform(listings[['latitude', 'longitude', 'price']])
kmeanModel.fit(scaled_features)
# Execute clusters
listings['price_cluster'] = kmeanModel.predict(scaled_features)
# Create Figure
import plotly.express as px
# Read the Mapbox token inside a context manager so the file handle is closed.
with open("../mapbox_token.txt") as token_file:
    px.set_mapbox_access_token(token_file.read())
# Labels are cast to str so each cluster is coloured as a discrete category.
fig = px.scatter_mapbox(
    lat=listings["latitude"], lon=listings["longitude"], color=listings["price_cluster"].astype(str),
    zoom=10, width=550, height=650)
fig.show()
# Create K-Means
k = 7
kmeanModel = KMeans(n_clusters=k)
scaler = MinMaxScaler()
# Scale the features once and reuse them for fitting and label assignment.
scaled_features = scaler.fit_transform(listings[['latitude', 'longitude', 'price']])
kmeanModel.fit(scaled_features)
# Execute clusters (overwrites any existing `price_cluster` column)
listings['price_cluster'] = kmeanModel.predict(scaled_features)
# Create Figure
import plotly.express as px
# Read the Mapbox token inside a context manager so the file handle is closed.
with open("../mapbox_token.txt") as token_file:
    px.set_mapbox_access_token(token_file.read())
# Labels are cast to str so each cluster is coloured as a discrete category.
fig = px.scatter_mapbox(
    lat=listings["latitude"], lon=listings["longitude"], color=listings["price_cluster"].astype(str),
    zoom=10, width=550, height=650)
fig.show()
# Per-cluster summary: listing count, price min/max/mean, and the mean of the
# capacity-related columns, rounded to one decimal place.
cluster_aggregations = {
    'id': 'count',
    'price': ['min', 'max', 'mean'],
    **{column: 'mean' for column in ('accommodates', 'bathrooms', 'bedrooms', 'beds')},
}
listings.groupby(['price_cluster']).agg(cluster_aggregations).round(1)
# Mean price over the available rows, grouped by the month number of the
# calendar date.
available_days = calendar[calendar.available == 't']
mean_price_by_month = available_days.groupby(calendar.date.dt.month).price.mean().reset_index()
mean_price_by_month.head(1)
fig = px.line(
    mean_price_by_month.rename(columns={'date': 'Month', 'price': 'Mean Price'}),
    x='Month',
    y='Mean Price',
    title='Mean Price',
)
fig.show()
# Highest calendar price per listing and month number.
monthly_max_price = calendar.groupby([calendar.listing_id, calendar.date.dt.month]).price.max().reset_index()
# Attach each listing's own price; the merge suffixes the two price columns
# as price_x (calendar) and price_y (listings).
with_listing_price = monthly_max_price.merge(
    listings[['id', 'price']],
    how='left',
    left_on='listing_id',
    right_on='id',
)
max_price_by_month = with_listing_price[['listing_id', 'date', 'price_x', 'price_y']].rename(
    columns={'price_x': 'room_price_by_month', 'price_y': 'room_price'})
# Ratio of the monthly maximum price to the listing price.
max_price_by_month['price_growth'] = max_price_by_month['room_price_by_month'].div(
    max_price_by_month['room_price'])
# Average ratio per month number, shown and then plotted.
mean_growth_by_month = max_price_by_month.groupby(['date']).price_growth.mean().reset_index()
mean_growth_by_month.head(1)
fig = px.line(
    mean_growth_by_month.rename(columns={'date': 'Month', 'price_growth': 'Price Growth'}),
    x='Month',
    y='Price Growth',
    title='Price Growth by Month',
)
fig.show()
# Attach each listing's cleansed neighbourhood to the monthly price-growth rows.
max_price_by_neighbourhood = max_price_by_month.merge(
    listings[['id', 'neighbourhood_cleansed']],
    how='left',
    left_on='listing_id',
    right_on='id'
).drop(
    columns=['id']
)
# Rank neighbourhoods by their largest price_growth value. The ranking is
# computed once and reused for the preview and the top-10 list.
neighbourhood_growth_ranking = (
    max_price_by_neighbourhood.groupby('neighbourhood_cleansed').price_growth.max()
    .sort_values(ascending=False)
    .reset_index()
)
neighbourhood_growth_ranking.head(1)
top_price_growth_neighbourhood = neighbourhood_growth_ranking.head(10).neighbourhood_cleansed.to_list()
# Read the Mapbox token inside a context manager so the file handle is closed.
with open("../mapbox_token.txt") as token_file:
    px.set_mapbox_access_token(token_file.read())
# One marker per top neighbourhood at the mean latitude/longitude of its
# listings; colour and size both encode the mean listing price.
fig = px.scatter_mapbox(  #[listings.neighbourhood_cleansed == 'Georgetown']
    listings[listings.neighbourhood_cleansed.isin(top_price_growth_neighbourhood)].groupby(
        'neighbourhood_cleansed')[['latitude', 'longitude', 'price']].mean().reset_index(),
    lat="latitude", lon="longitude", color='price', size='price',
    zoom=10, width=550, height=650, title='Top 10 neighborhoods with the highest price increase')
fig.show()